ETL data from:
1) Fannie Mae Loan Acquisition and Performance Data [Individual Mortgage Loans],
2) Past Fannie Mae Loan Acquisition Data [Changes in Loan Amounts],
3) U.S. Census Bureau, Small Area Estimates Branch [Median Household Income by County],
4) Federal Reserve Economic Data (FRED) [Macroeconomic Data related to the Housing Market],
5) Federal Deposit Insurance Corporation (FDIC) Data [Information on FDIC-backed Banks]
%run Functions.ipynb
pd.set_option("display.max_columns", 200)
pd.set_option('display.max_rows', 200)
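Functions.ipynb is assumed to supply the shared helpers and imports used throughout; for reference, a sketch of the assumed environment:
# Assumed imports (normally provided via %run Functions.ipynb)
import os
import glob
import pickle
import zipfile
import numpy as np
import pandas as pd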
# Collect file names
fld = '..\\Data\\'
acq_files = []
for file in os.listdir(fld):
    filename = os.fsdecode(file)
    if filename.startswith("Acq_200"):
        acq_files.append(fld + filename)
# Load data
df = pd.DataFrame()
for path in acq_files:
    acq = pd.read_csv(path)
    # Drop NA values in Original Date
    acq = acq.loc[acq['Original Date'].notna(), :]
    acq['File Year'] = int(path[12:16])
    acq['File Quarter'] = path[16:18]
    # Raw dates arrive as numeric MMYYYY values; zero-pad to six digits before slicing
    acq['Original Date'] = acq['Original Date'].astype(float).astype(int).astype(str)
    acq['Original Date'] = acq['Original Date'].apply(lambda d: d.zfill(6))
    acq['Month'] = acq['Original Date'].apply(lambda d: d[0:2])
    acq['Year'] = acq['Original Date'].apply(lambda d: d[2:6]).astype(int)
    acq['Original Date'] = (acq['Original Date'].apply(lambda d: d[0:2]) + '/' +
                            acq['Original Date'].apply(lambda d: d[2:6])).apply(str)
    acq = acq.drop(labels=['Servicer', 'Master Servicer'], axis=1)
    acq = acq.loc[acq['File Year'] <= 2008, :]
    df = pd.concat([df, acq], ignore_index=True)
    print('..Loaded year', path[12:16], 'quarter', path[16:18])
# Drop U.S. Territories due to missing data
df = df[~df['Property State'].isin(['PR', 'GU', 'VI'])]
df.head(10)
# Dates
## Date values preceding the file date indicate a mortgage loan refinanced on that earlier date
df['Original Date'].value_counts()
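To illustrate the parsing above: raw acquisition dates arrive as numeric MMYYYY values, so January dates lose their leading zero and must be padded back (hypothetical value):
# Hypothetical raw value: 12006.0 encodes January 2006 as MMYYYY
raw = 12006.0
padded = str(int(raw)).zfill(6)           # '012006'
month, year = padded[0:2], padded[2:6]    # '01', '2006'
print(month + '/' + year)                 # 01/2006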
# Co-Borrower Credit Score
## Average the borrower and co-borrower scores when both are present
df['Harmonized Credit Score'] = ( df.loc[df['Co-Borrower Credit Score'].notnull(), 'Co-Borrower Credit Score'] * 0.5 ) \
    + ( df['Credit Score'].loc[df['Co-Borrower Credit Score'].notnull()] * 0.5 )
## Fall back to the borrower's score when the co-borrower score is missing
df.loc[df['Co-Borrower Credit Score'].isnull(), 'Harmonized Credit Score'] = \
    df.loc[df['Co-Borrower Credit Score'].isnull(), 'Credit Score']
## Final fallback for any rows still missing a harmonized score
df.loc[df['Harmonized Credit Score'].isnull(), 'Harmonized Credit Score'] = \
    df.loc[df['Harmonized Credit Score'].isnull(), 'Credit Score']
print(df[['Harmonized Credit Score', 'Credit Score', 'Co-Borrower Credit Score']].head(10))
df = df.drop(labels=['Credit Score', 'Co-Borrower Credit Score'], axis=1)
df = df.rename(columns={'Harmonized Credit Score': 'Credit Score'})
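For reference, the same harmonized score could have been computed in one step before the columns were dropped; a sketch, where df_raw stands for the frame prior to the drop:
# Sketch: a row-wise mean skips NaN, so a missing co-borrower score falls
# back to the borrower's score automatically (differs from the three-step
# version only when the borrower's own score is missing)
df_raw['Credit Score'] = df_raw[['Credit Score', 'Co-Borrower Credit Score']].mean(axis=1)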
# Mortgage Insurance %
## Missing values indicate no mortgage insurance
df['Mortgage Insurance %'] = np.where(df['Mortgage Insurance %'].isna(),
                                      0, df['Mortgage Insurance %'])
## Binary flag: 1 = no mortgage insurance, 0 = insured
df['Mortgage Insurance Type'] = np.where(df['Mortgage Insurance %'] == 0,
                                         1, 0)
df['Mortgage Insurance Type'].value_counts()
# Collapse Refinance
print('Original Distribution')
print(df['Loan Purpose'].value_counts())
## Binary flag: 1 = refinance (any non-purchase purpose), 0 = purchase
df['Loan Purpose'] = np.where(df['Loan Purpose'] != 'P', 1, 0)
print('')
print('New Distribution')
print(df['Loan Purpose'].value_counts())
# Recode Number of Borrowers
## Single Borrower binary: 1 = one borrower, 0 = more than one
print('Original Distribution')
print(df['Number of Borrowers'].value_counts())
df['Number of Borrowers'] = np.where(df['Number of Borrowers'] == 1, \
1, 0)
print('')
print('New Distribution')
df = df.rename(columns={'Number of Borrowers': 'Single Borrower'})
df['Single Borrower'].value_counts()
Goal is to keep banks with loan counts of roughly 10,000 or more
# Bank Values
df = df.rename({'Lender': 'Bank'}, axis='columns')
df.loc[df['File Year'] >= 2006, 'Bank'].value_counts()[:15]
# Recode Bank so retained banks have loan counts of roughly 50,000
## Map similar lender names to a common bank name
df = grep_bank_groupings(df, Bank = 'Bank')
df.loc[df['File Year'] >= 2006, 'Bank'].value_counts()[:15]
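grep_bank_groupings is defined in Functions.ipynb; a minimal sketch of the idea with hypothetical patterns (the actual mapping may differ):
import re

# Hypothetical sketch of grep_bank_groupings: collapse lender-name
# variants onto a canonical bank name via regex matching
BANK_PATTERNS = {
    r'BANK OF AMERICA': 'Bank of America',
    r'WELLS FARGO': 'Wells Fargo Bank',
    r'JPMORGAN|CHASE': 'JPMorgan Chase',
}

def grep_bank_groupings_sketch(frame, Bank='Bank'):
    def canonical(name):
        for pattern, label in BANK_PATTERNS.items():
            if isinstance(name, str) and re.search(pattern, name.upper()):
                return label
        return name
    frame[Bank] = frame[Bank].map(canonical)
    return frame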
# Target Variable with all data
target_values(df.loc[df['File Year'] >= 2006, 'Foreclosed'], data=True)
banks = ['Bank of America', 'Wells Fargo Bank', 'CitiMortgage',
         'JPMorgan Chase', 'GMAC Mortgage', 'SunTrust Mortgage',
         'AmTrust Bank', 'PNC Bank', 'Flagstar Bank']
## Group all other banks into 'Other'
df['Bank'] = np.where(df['Bank'].isin(banks), df['Bank'], 'Other')
# Checkpoint the loaded data
with open("..\\Data\\Pickle\\df_load.pkl", "wb") as file_to_store:
    pickle.dump(df, file_to_store)
with open("..\\Data\\Pickle\\df_load.pkl", "rb") as file_to_open:
    df = pickle.load(file_to_open)
# Drop other banks
df = df.loc[df['Bank'] != 'Other',:]
df_crisis = df.loc[(df['File Year'] >= 2006) & (df['File Year'] <= 2008),:]
df_precrisis = df.loc[(df['File Year'] <= 2005),:]
# Target Variable for the crisis subset
print('\nThe number of features is:\n', df_crisis.shape[1], sep='')
print('\nThe number of observations is:\n', df_crisis.shape[0], sep='')
target_values(df_crisis['Foreclosed'], data=True)
# Aggregate to average original mortgage amount by Bank, File Year/Quarter, and Zip Code
loan_lev = df.groupby(['Bank', 'File Year', 'File Quarter', 'Zip Code']) \
.agg({'Original Mortgage Amount': 'mean'}).reset_index()
# Previous years: build 1-year and 5-year lagged comparisons for each crisis quarter
Qtr = ['Q1', 'Q2', 'Q3', 'Q4']
Yr = [2006, 2007, 2008]
loan_lev_all_yrs = pd.DataFrame()
for yr in Yr:
    for qtr in Qtr:
        subset = loan_lev.loc[(loan_lev['File Quarter'] == qtr) & (loan_lev['File Year'] == yr), :]
        subset_1yr = loan_lev.loc[(loan_lev['File Quarter'] == qtr) & (loan_lev['File Year'] == (yr - 1)), :]
        subset_5yr = loan_lev.loc[(loan_lev['File Quarter'] == qtr) & (loan_lev['File Year'] == (yr - 5)), :]
        loan_lev_stacks = pd.merge(subset, subset_1yr, on=['Bank', 'Zip Code'],
                                   how='outer', suffixes=('', ' (1 Yr)'), copy=True)
        loan_lev_stacks = pd.merge(loan_lev_stacks, subset_5yr, on=['Bank', 'Zip Code'],
                                   how='outer', suffixes=('', ' (5 Yr)'), copy=True)
        loan_lev_stacks['Loan Change (1 Year)'] = \
            loan_lev_stacks['Original Mortgage Amount'] - loan_lev_stacks['Original Mortgage Amount (1 Yr)']
        loan_lev_stacks['Loan Change (5 Years)'] = \
            loan_lev_stacks['Original Mortgage Amount'] - loan_lev_stacks['Original Mortgage Amount (5 Yr)']
        loan_lev_all_yrs = pd.concat([loan_lev_all_yrs, loan_lev_stacks], axis=0)
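For reference, the suffix behavior of the merges above on a toy frame (hypothetical numbers):
# Toy illustration of the suffix merges above (hypothetical data)
a = pd.DataFrame({'Bank': ['X'], 'Zip Code': [941], 'Original Mortgage Amount': [300.0]})
b = pd.DataFrame({'Bank': ['X'], 'Zip Code': [941], 'Original Mortgage Amount': [250.0]})
m = pd.merge(a, b, on=['Bank', 'Zip Code'], how='outer', suffixes=('', ' (1 Yr)'))
# m holds 'Original Mortgage Amount' and 'Original Mortgage Amount (1 Yr)',
# so the 1-year change is their difference: 300.0 - 250.0 = 50.0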
# Merge
drop_vars = ['File Year (1 Yr)', 'File Quarter (1 Yr)', 'Original Mortgage Amount (1 Yr)',
'File Year (5 Yr)', 'File Quarter (5 Yr)', 'Original Mortgage Amount (5 Yr)']
loan_lev_all_yrs = loan_lev_all_yrs.drop(labels=drop_vars, axis=1)
df = pd.merge(df_crisis, loan_lev_all_yrs, on=['Bank', 'Zip Code', 'File Year', 'File Quarter'], how='inner', copy=True)
df = df.drop(labels=['Original Mortgage Amount_y'], axis=1)
df = df.rename(columns={'Original Mortgage Amount_x': 'Original Mortgage Amount'})
# 1 Year change in total loans
print('Average change in 1 year: $', np.round(np.mean(df['Loan Change (1 Year)']),2))
# 5 year change in total loans
print('Average change in 5 years: $', np.round(np.mean(df['Loan Change (5 Years)']),2))
print('Shape:', df.shape)
ETL county-level median household income from the U.S. Census
Aggregate on 3-digit ZIP code
# Import county-level median household income
income = pd.read_excel("..\\Data\\est07all.xls",
                       sheet_name='est07ALL', header=2)
income = income[['Name', 'Median Household Income']]
income = income.rename(columns={'Name': 'County'})
# Import county-zipcode crosswalk
crosswalk = pd.read_csv("..\\Data\\ZIP-COUNTY-FIPS_2017-06.csv",
                        header=0)
crosswalk = crosswalk[['ZIP', 'COUNTYNAME', 'STCOUNTYFP']]
crosswalk = crosswalk.rename(columns={'ZIP': 'Zip Code', 'COUNTYNAME': 'County', 'STCOUNTYFP': 'FIPS'})
## Truncate 5-digit ZIP codes to their 3-digit prefix
crosswalk['Zip Code'] = crosswalk['Zip Code'].astype(str)
crosswalk['Zip Code'] = crosswalk['Zip Code'].str.slice(start=0, stop=-2)
crosswalk['Zip Code'] = crosswalk['Zip Code'].astype(int)
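A quick check of the truncation above (hypothetical ZIP code):
# Hypothetical check: a 5-digit ZIP reduces to its 3-digit prefix
assert int(str(94103)[:-2]) == 941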
# Merge
county_level = pd.merge(income, crosswalk, on='County', how='outer')
county_level.head(10)
# Merge the zipcode-level FIPS codes and median income onto the loan data
FIPS_merge = county_level[['FIPS', 'Zip Code']]
FIPS_merge = FIPS_merge.groupby('Zip Code').agg({'FIPS': 'first'})
income_merge = county_level[['Median Household Income', 'Zip Code']]
income_merge = income_merge.groupby('Zip Code').agg({'Median Household Income': 'mean'})
df = pd.merge(df, FIPS_merge, on='Zip Code', how='left')
df = pd.merge(df, income_merge, on='Zip Code', how='left')
df['Median Household Income'].describe()
# State to Region Conversion
df = to_region(df, 'Property State')
df['Region'].value_counts()
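to_region is defined in Functions.ipynb; a minimal sketch of the mapping, showing only a few states (the actual mapping may differ):
# Hypothetical sketch of to_region: map state abbreviations to the four
# Census regions used below ('NE', 'SO', 'MW', 'WE')
STATE_TO_REGION = {
    'NY': 'NE', 'PA': 'NE',   # Northeast
    'FL': 'SO', 'TX': 'SO',   # South
    'IL': 'MW', 'OH': 'MW',   # Midwest
    'CA': 'WE', 'WA': 'WE',   # West
}

def to_region_sketch(frame, state_col):
    frame['Region'] = frame[state_col].map(STATE_TO_REGION)
    return frame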
# Household Financial Obligations as a percent of Disposable Personal Income (FODSP)
# Consumer Debt Service Payments as a Percent of Disposable Personal Income (CDSP)
# S&P/Case-Shiller U.S. National Home Price Index (CSUSHPINSA)
# Mortgage Debt Service Payments as a Percent of Disposable Personal Income (MDSP)
# Monthly Supply of Houses in the United States (MSACSR)
fred_df = ['FODSP', 'CDSP', 'CSUSHPINSA', 'MDSP', 'MSACSR']
fred_name = ['Household Financial Obligations', 'Consumer Debt Service Payment',
'National Home Price Index', 'Mortgage Debt Service Payments', 'Monthly Supply of Houses']
fred_freq = ['qtr', 'qtr', 'yr', 'qtr', 'yr']
for i in range(len(fred_df)):
    fred_tmp = pd.read_csv('..\\Data\\FRED\\' + fred_df[i] + '.csv', header=0)
    if fred_freq[i] == 'qtr':
        df = fred_merge(fred_tmp, df, quarter=True, varname=fred_name[i])
    else:
        df = fred_merge(fred_tmp, df, quarter=False, varname=fred_name[i])
print('Shape:', df.shape)
display(df.tail())
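fred_merge is defined in Functions.ipynb; a minimal sketch of what such a merge might look like, assuming each FRED CSV holds a DATE column plus one value column (the real helper may differ):
def fred_merge_sketch(fred, df_orig, quarter=True, varname='Series'):
    # Hypothetical sketch: align a FRED series with the loan data on
    # year (and quarter, when merged at quarterly frequency)
    fred = fred.copy()
    fred['DATE'] = pd.to_datetime(fred['DATE'])
    fred['File Year'] = fred['DATE'].dt.year
    value_col = [c for c in fred.columns if c not in ('DATE', 'File Year')][0]
    fred = fred.rename(columns={value_col: varname})
    if quarter:
        fred['File Quarter'] = 'Q' + fred['DATE'].dt.quarter.astype(str)
        keys = ['File Year', 'File Quarter']
    else:
        keys = ['File Year']
    # collapse to one value per period, then left-join onto the loan data
    fred = fred.groupby(keys, as_index=False)[varname].mean()
    return pd.merge(df_orig, fred, on=keys, how='left')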
# Housing Inventory Estimate: Vacant Housing Units for Sale (ESALEUSQ176N)
# Homeownership Rate for the United States (RHORUSQ156N)
# Housing Inventory Estimate: Vacant Housing Units for Rent (ERENTUSQ176N)
# Rental Vacancy Rate for the United States (RRVRUSQ156N)
fred_df = ['ESALEUSQ176N', 'RHORUSQ156N', 'ERENTUSQ176N', 'RRVRUSQ156N']
fred_name = ['Vacant Housing Units for Sale', 'Homeownership Rate', 'Vacant Housing Units for Rent',
'Rental Vacancy Rate']
fred_freq = ['qtr', 'qtr', 'qtr', 'qtr']
for i in range(len(fred_df)):
    # Split the national series ID around its 'US' geography code so the
    # regional variants (e.g., ESALENEQ176N for the Northeast) can be built
    sub = len(fred_df[i]) - 7
    fred_prefix = fred_df[i][0:sub]
    fred_suffix = fred_df[i][-5:]
    fred_tmp = {}
    for region in ['NE', 'SO', 'MW', 'WE']:
        fred_tmp[region] = pd.read_csv('..\\Data\\FRED\\' + fred_prefix + region + fred_suffix + '.csv', header=0)
    if fred_freq[i] == 'qtr':
        df = fred_merge_region(NE=fred_tmp['NE'], SO=fred_tmp['SO'], MW=fred_tmp['MW'],
                               WE=fred_tmp['WE'], varname=fred_name[i], df_orig=df,
                               quarter=True)
    else:
        df = fred_merge_region(NE=fred_tmp['NE'], SO=fred_tmp['SO'], MW=fred_tmp['MW'],
                               WE=fred_tmp['WE'], varname=fred_name[i], df_orig=df,
                               quarter=False)
print('Shape:', df.shape)
display(df.tail())
# use pandas to construct a list of quarterly report dates
present = '20081231'
datetimes = pd.date_range('19930331', end=present, freq='Q')
# get a list of zip files over which to iterate
zip_files = glob.glob('..\\Data\\FDIC\\*.zip')
# only want to return a subset of cols
used_columns = ['name', 'repdte', 'asset', 'lnlsnet', 'liab', 'dep', 'eqtot', 'numemp']
used_dtypes = {'name': str, 'repdte': object, 'asset': float,
'lnlsnet': float, 'liab': float, 'eqtot': float, 'dep': float, 'numemp': float}
# create a container for the individual dataframes
dataframes = []
for zip_file in zip_files:
    tmp_buffer = zipfile.ZipFile(zip_file)
    # want to work with the assets and liabilities file
    # (selected by position, which assumes a fixed archive layout)
    tmp_file = tmp_buffer.namelist()[5]
    tmp_dataframe = pd.read_csv(tmp_buffer.open(tmp_file),
                                on_bad_lines='skip',  # skips the mangled obs
                                usecols=used_columns,
                                dtype=used_dtypes,
                                parse_dates=True)
    dataframes.append(tmp_dataframe)
# concatenate the quarterly dataframes into a single data frame
fdic = pd.concat(dataframes)
# convert units from thousands to billions of USD
fdic[['asset', 'lnlsnet', 'liab', 'dep', 'eqtot']] /= 1e6
# convert units from number of people to thousands of people
fdic['numemp'] /= 1e3
# Map similar names to bank
fdic = grep_bank_groupings(fdic, Bank = 'name')
## Group other banks
fdic['Bank'] = np.where(fdic['Bank'].isnull(), 'Other', fdic['Bank'])
# Drop name
fdic = fdic.drop(labels=['name'], axis=1)
# Convert to panel
fdic = fdic.groupby(['Bank', 'repdte']).sum()
fdic = fdic.reset_index(drop=False)
# Fill monthly data
fdic = fdic_on_month(fdic)
fdic = fdic.groupby(['Bank', 'Original Date']).sum()
print('Shape:', fdic.shape)
display(fdic.tail())
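fdic_on_month is defined in Functions.ipynb; a minimal sketch of the idea, assuming it forward-fills quarterly reports onto the monthly 'Original Date' (MM/YYYY) key used by the loan data (the real helper may differ):
def fdic_on_month_sketch(fdic_q):
    # Hypothetical sketch: expand quarterly FDIC report dates to monthly
    # observations so they can be merged with the loan-level data
    fdic_q = fdic_q.copy()
    fdic_q['repdte'] = pd.to_datetime(fdic_q['repdte'])
    out = []
    for bank, grp in fdic_q.groupby('Bank'):
        grp = grp.set_index('repdte').sort_index()
        monthly = grp.resample('M').ffill()  # carry each report forward
        monthly['Original Date'] = monthly.index.strftime('%m/%Y')
        out.append(monthly.reset_index(drop=True))
    return pd.concat(out, ignore_index=True)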
# compute the by quarter totals for each measure
quarter_totals = fdic.groupby(['Original Date']).sum()
# compute the base quarter totals for each measure
totals_base_qtr = quarter_totals.iloc[0,:]
def janicki_prescott_norm(column):
    r"""
    In order to make sure results are comparable across years, I follow
    Janicki and Prescott (2006) and deflate and re-scale each measure of bank
    size by dividing by banking sector totals relative to some base quarter.
    Specifically, let :math:`S_{i,t}^{raw}` denote the raw size of bank :math:`i`
    in year :math:`t` based on one of the six size measures detailed above. The
    normalized size of bank :math:`i` relative to the base quarter is defined as
    follows:

    .. math::

        S_{i,t}^{norm} = \frac{S_{i,t}^{raw}}{\sum_{j}S_{j,t}^{raw}}\sum_{j}S_{j,base}^{raw}

    where :math:`\sum_{j}S_{j,t}^{raw}` is the banking sector total of some size
    measure in year :math:`t` (i.e., total banking sector assets in year :math:`t`),
    and :math:`\sum_{j}S_{j,base}^{raw}` is the banking sector total of the same
    size measure in the base quarter.
    """
    # Deflate by the sector total each quarter (aligning on the
    # 'Original Date' index level), then re-scale by the base-quarter total
    return fdic[column].div(quarter_totals[column], level='Original Date') \
        * totals_base_qtr[column]
# apply the Janicki and Prescott (2006) normalized size measure
for column in fdic.columns:
    fdic[column] = janicki_prescott_norm(column)
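A quick numeric check of the normalization (hypothetical numbers): a bank with raw size 10 in a quarter whose sector total is 100, against a base-quarter sector total of 80, gets normalized size (10 / 100) * 80 = 8.
# Hypothetical check: (raw size / sector total) * base-quarter sector total
assert (10 / 100) * 80 == 8.0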
# Period change: with monthly rows, 12 periods = 1 year and 60 = 5 years
fdic = fdic.reset_index(drop=False)
# sort chronologically (the 'MM/YYYY' strings would otherwise sort month-first)
fdic['Date Sort'] = pd.to_datetime(fdic['Original Date'], format='%m/%Y')
fdic = fdic.sort_values(['Bank', 'Date Sort']).drop(labels=['Date Sort'], axis=1)
for col in ['asset', 'lnlsnet', 'liab', 'dep', 'eqtot']:
    # compute within each bank so changes never span across banks
    fdic[str(col.title() + ' (5 Yr)')] = fdic.groupby('Bank')[col].pct_change(60)
    fdic[str(col.title() + ' (1 Yr)')] = fdic.groupby('Bank')[col].pct_change(12)
# Drop total values
fdic = fdic.drop(labels=['asset', 'lnlsnet', 'liab', 'dep', 'eqtot'], axis=1)
# Rename number of employees
fdic = fdic.rename(columns={'numemp': 'Number of Employees'})
# Regroup
fdic = fdic.groupby(['Bank', 'Original Date']).sum()
display(fdic.tail())
# Merge
df = pd.merge(df, fdic, on=['Bank', 'Original Date'], how='left')
print('Shape:', df.shape)
display(df.tail())
# Missing
print( (df.isna().sum() / df.shape[0] * 100).round(2) )
with open("..\\Data\\Pickle\\df.pkl", "wb") as file_to_store:
    pickle.dump(df, file_to_store)